In [31]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import pymc3 as pm
import pandas as pd
%matplotlib inline
sns.set(font_scale=1.5)
In [32]:
tips = sns.load_dataset('tips')
tips.tail()
Out[32]:
In [33]:
sns.violinplot(x='day', y='tip', data=tips)
Out[33]:
In [34]:
y = tips['tip'].values
idx = pd.Categorical(tips['day']).codes
In [35]:
with pm.Model() as comparing_groups:
means = pm.Normal('means', mu=0, sd=10, shape=len(set(idx)))
sds = pm.HalfNormal('sds', sd=10, shape=len(set(idx)))
ymod = pm.Normal('y', mu=means[idx], sd=sds[idx], observed=y)
trace_cg = pm.sample(5000, chains=4)
chain_cg = trace_cg[100::]
In [36]:
pm.traceplot(chain_cg, compact=True);
In [37]:
pm.summary(chain_cg)
Out[37]:
In [40]:
pm.plot_posterior(chain_cg);
In [38]:
dist = dist = stats.norm()
_, ax = plt.subplots(3, 2, figsize=(16, 12))
comparisons = [(i,j) for i in range(4) for j in range(i+1, 4)]
pos = [(k,l) for k in range(3) for l in (0, 1)]
for (i, j), (k,l) in zip(comparisons, pos):
means_diff = chain_cg['means'][:,i]-chain_cg['means'][:,j]
d_cohen = (means_diff / np.sqrt((chain_cg['sds'][:,i]**2 + chain_cg['sds'][:,j]**2) / 2)).mean()
ps = dist.cdf(d_cohen/(2**0.5))
pm.plot_posterior(means_diff, ref_val=0, ax=ax[k, l],
color='skyblue')
ax[k, l].plot(0, label="Cohen's d = {:.2f}\nProb sup = {:.2f}".format(d_cohen, ps) ,alpha=0)
ax[k, l].set_xlabel('$\mu_{}-\mu_{}$'.format(i, j),
fontsize=18)
ax[k,l ].legend(loc='upper right', fontsize=14)
plt.tight_layout()
In [41]:
with pm.Model() as comparing_groups:
mu = pm.Flat('flat')
means = pm.Normal('means', mu=mu, sd=10, shape=len(set(idx)))
sds = pm.HalfNormal('sds', sd=10, shape=len(set(idx)))
ymod = pm.Normal('y', mu=means[idx], sd=sds[idx], observed=y)
trace_cg = pm.sample(5000, chains=4)
chain_cg = trace_cg[100::]
In [42]:
pm.traceplot(chain_cg, compact=True);
In [43]:
pm.summary(chain_cg)
Out[43]:
In [44]:
dist = dist = stats.norm()
_, ax = plt.subplots(3, 2, figsize=(16, 12))
comparisons = [(i,j) for i in range(4) for j in range(i+1, 4)]
pos = [(k,l) for k in range(3) for l in (0, 1)]
for (i, j), (k,l) in zip(comparisons, pos):
means_diff = chain_cg['means'][:,i]-chain_cg['means'][:,j]
d_cohen = (means_diff / np.sqrt((chain_cg['sds'][:,i]**2 + chain_cg['sds'][:,j]**2) / 2)).mean()
ps = dist.cdf(d_cohen/(2**0.5))
pm.plot_posterior(means_diff, ref_val=0, ax=ax[k, l],
color='skyblue')
ax[k, l].plot(0, label="Cohen's d = {:.2f}\nProb sup = {:.2f}".format(d_cohen, ps) ,alpha=0)
ax[k, l].set_xlabel('$\mu_{}-\mu_{}$'.format(i, j),
fontsize=18)
ax[k,l ].legend(loc='upper right', fontsize=14)
plt.tight_layout()
In [45]:
y_pred = pm.sample_ppc(chain_cg, 100, comparing_groups, size=len(y))
In [48]:
y_pred['y'].shape
Out[48]:
In [61]:
for i in set(idx):
sns.distplot(y[idx==i])
sns.distplot(y_pred['y'].flatten())
Out[61]:
In [ ]:
In [ ]:
In [ ]: